{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Engineering for High-Frequency Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports & Settings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:13:13.174416Z",
     "start_time": "2020-06-25T16:13:13.170652Z"
    }
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:13:14.418084Z",
     "start_time": "2020-06-25T16:13:13.845173Z"
    }
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from scipy.stats import spearmanr\n",
    "import talib\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:13:16.061429Z",
     "start_time": "2020-06-25T16:13:16.059565Z"
    }
   },
   "outputs": [],
   "source": [
    "sns.set_style('whitegrid')\n",
    "idx = pd.IndexSlice"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data prep"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We use the 'Trade and Quote' dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:13:55.136832Z",
     "start_time": "2020-06-25T16:13:55.130068Z"
    }
   },
   "outputs": [],
   "source": [
    "as_path = Path('../data/nasdaq100')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:13:55.436776Z",
     "start_time": "2020-06-25T16:13:55.431542Z"
    }
   },
   "outputs": [],
   "source": [
    "tcols = ['openbartime', 'firsttradetime',\n",
    "         'highbidtime', 'highasktime', 'hightradetime',\n",
    "         'lowbidtime', 'lowasktime', 'lowtradetime',\n",
    "         'closebartime', 'lasttradetime']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:13:57.248661Z",
     "start_time": "2020-06-25T16:13:57.246885Z"
    }
   },
   "outputs": [],
   "source": [
    "drop_cols = ['unknowntickvolume',\n",
    "             'cancelsize',\n",
    "             'tradeatcrossorlocked']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:13:57.547662Z",
     "start_time": "2020-06-25T16:13:57.543435Z"
    }
   },
   "outputs": [],
   "source": [
    "keep = ['firsttradeprice', 'hightradeprice', 'lowtradeprice', 'lasttradeprice', \n",
    "        'minspread', 'maxspread',\n",
    "        'volumeweightprice', 'nbboquotecount', \n",
    "        'tradeatbid', 'tradeatbidmid', 'tradeatmid', 'tradeatmidask', 'tradeatask', \n",
    "        'volume', 'totaltrades', 'finravolume', \n",
    "        'finravolumeweightprice', \n",
    "        'uptickvolume', 'downtickvolume', \n",
    "        'repeatuptickvolume', 'repeatdowntickvolume', \n",
    "        'tradetomidvolweight', 'tradetomidvolweightrelative']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:13:57.885629Z",
     "start_time": "2020-06-25T16:13:57.878067Z"
    }
   },
   "outputs": [],
   "source": [
    "columns = {'volumeweightprice'          : 'price',\n",
    "           'finravolume'                : 'fvolume',\n",
    "           'finravolumeweightprice'     : 'fprice',\n",
    "           'uptickvolume'               : 'up',\n",
    "           'downtickvolume'             : 'down',\n",
    "           'repeatuptickvolume'         : 'rup',\n",
    "           'repeatdowntickvolume'       : 'rdown',\n",
    "           'firsttradeprice'            : 'first',\n",
    "           'hightradeprice'             : 'high',\n",
    "           'lowtradeprice'              : 'low',\n",
    "           'lasttradeprice'             : 'last',\n",
    "           'nbboquotecount'             : 'nbbo',\n",
    "           'totaltrades'                : 'ntrades',\n",
    "           'openbidprice'               : 'obprice',\n",
    "           'openbidsize'                : 'obsize',\n",
    "           'openaskprice'               : 'oaprice',\n",
    "           'openasksize'                : 'oasize',\n",
    "           'highbidprice'               : 'hbprice',\n",
    "           'highbidsize'                : 'hbsize',\n",
    "           'highaskprice'               : 'haprice',\n",
    "           'highasksize'                : 'hasize',\n",
    "           'lowbidprice'                : 'lbprice',\n",
    "           'lowbidsize'                 : 'lbsize',\n",
    "           'lowaskprice'                : 'laprice',\n",
    "           'lowasksize'                 : 'lasize',\n",
    "           'closebidprice'              : 'cbprice',\n",
    "           'closebidsize'               : 'cbsize',\n",
    "           'closeaskprice'              : 'caprice',\n",
    "           'closeasksize'               : 'casize',\n",
    "           'firsttradesize'             : 'firstsize',\n",
    "           'hightradesize'              : 'highsize',\n",
    "           'lowtradesize'               : 'lowsize',\n",
    "           'lasttradesize'              : 'lastsize',\n",
    "           'tradetomidvolweight'        : 'volweight',\n",
    "           'tradetomidvolweightrelative': 'volweightrel'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:14:37.426223Z",
     "start_time": "2020-06-25T16:14:37.424431Z"
    }
   },
   "outputs": [],
   "source": [
    "parquet_path = as_path / '1min_taq' / 'parquet'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:14:47.518888Z",
     "start_time": "2020-06-25T16:14:47.501188Z"
    }
   },
   "outputs": [],
   "source": [
    "files = list(parquet_path.glob('*.parquet'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:14:50.333170Z",
     "start_time": "2020-06-25T16:14:50.314044Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[PosixPath('../data/nasdaq100/1min_taq/parquet/20140421.parquet'),\n",
       " PosixPath('../data/nasdaq100/1min_taq/parquet/20130905.parquet'),\n",
       " PosixPath('../data/nasdaq100/1min_taq/parquet/20151117.parquet'),\n",
       " PosixPath('../data/nasdaq100/1min_taq/parquet/20150629.parquet'),\n",
       " PosixPath('../data/nasdaq100/1min_taq/parquet/20170817.parquet')]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "files[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:15:13.118920Z",
     "start_time": "2020-06-25T16:15:12.907839Z"
    }
   },
   "outputs": [],
   "source": [
    "df = pd.read_parquet(files[0]).drop(tcols + drop_cols, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:15:14.954614Z",
     "start_time": "2020-06-25T16:15:14.915380Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 91735 entries, ('PAYX', Timestamp('2014-04-21 04:00:00')) to ('GOOG', Timestamp('2014-04-21 20:00:00'))\n",
      "Data columns (total 43 columns):\n",
      " #   Column                       Non-Null Count  Dtype  \n",
      "---  ------                       --------------  -----  \n",
      " 0   openbidprice                 89735 non-null  float64\n",
      " 1   openbidsize                  89735 non-null  float64\n",
      " 2   openaskprice                 88141 non-null  float64\n",
      " 3   openasksize                  88141 non-null  float64\n",
      " 4   firsttradeprice              41715 non-null  float64\n",
      " 5   firsttradesize               41715 non-null  float64\n",
      " 6   highbidprice                 89836 non-null  float64\n",
      " 7   highbidsize                  89836 non-null  float64\n",
      " 8   highaskprice                 88242 non-null  float64\n",
      " 9   highasksize                  88242 non-null  float64\n",
      " 10  hightradeprice               41715 non-null  float64\n",
      " 11  hightradesize                41715 non-null  float64\n",
      " 12  lowbidprice                  89836 non-null  float64\n",
      " 13  lowbidsize                   89836 non-null  float64\n",
      " 14  lowaskprice                  88242 non-null  float64\n",
      " 15  lowasksize                   88242 non-null  float64\n",
      " 16  lowtradeprice                41715 non-null  float64\n",
      " 17  lowtradesize                 41715 non-null  float64\n",
      " 18  closebidprice                89836 non-null  float64\n",
      " 19  closebidsize                 89836 non-null  float64\n",
      " 20  closeaskprice                88242 non-null  float64\n",
      " 21  closeasksize                 88242 non-null  float64\n",
      " 22  lasttradeprice               41715 non-null  float64\n",
      " 23  lasttradesize                41715 non-null  float64\n",
      " 24  minspread                    86247 non-null  float64\n",
      " 25  maxspread                    86252 non-null  float64\n",
      " 26  volumeweightprice            40124 non-null  float64\n",
      " 27  nbboquotecount               91735 non-null  int64  \n",
      " 28  tradeatbid                   91735 non-null  int64  \n",
      " 29  tradeatbidmid                91735 non-null  int64  \n",
      " 30  tradeatmid                   91735 non-null  int64  \n",
      " 31  tradeatmidask                91735 non-null  int64  \n",
      " 32  tradeatask                   91735 non-null  int64  \n",
      " 33  volume                       91735 non-null  int64  \n",
      " 34  totaltrades                  91735 non-null  int64  \n",
      " 35  finravolume                  91735 non-null  int64  \n",
      " 36  finravolumeweightprice       37663 non-null  float64\n",
      " 37  uptickvolume                 91735 non-null  int64  \n",
      " 38  downtickvolume               91735 non-null  int64  \n",
      " 39  repeatuptickvolume           91735 non-null  int64  \n",
      " 40  repeatdowntickvolume         91735 non-null  int64  \n",
      " 41  tradetomidvolweight          40124 non-null  float64\n",
      " 42  tradetomidvolweightrelative  40124 non-null  float64\n",
      "dtypes: float64(30), int64(13)\n",
      "memory usage: 30.4+ MB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:41.899399Z",
     "start_time": "2020-06-20T16:19:41.885960Z"
    }
   },
   "outputs": [],
   "source": [
    "def data_to_hdf():\n",
    "    parquet_path = as_path / '1min_taq' / 'parquet'\n",
    "    files = list(parquet_path.glob('*.parquet'))\n",
    "    for year in range(2013, 2018):\n",
    "        print(year)\n",
    "        data = []\n",
    "        for f in files:\n",
    "            if f.stem.startswith(str(year)):\n",
    "                # print(f.stem, end=' ', flush=True)\n",
    "                data.append(pd.read_parquet(f).drop(tcols + drop_cols, axis=1))\n",
    "        data = (pd.concat(data).sort_index()\n",
    "                .reset_index('ticker')\n",
    "                .between_time('9:30', '16:00')\n",
    "                .set_index('ticker', append=True)\n",
    "                .swaplevel()\n",
    "                .rename(columns=columns)\n",
    "                .rename(columns=lambda x: x.replace('tradeat', 'at')))\n",
    "        print(data.info(null_counts=True))\n",
    "        data.to_hdf('algoseek.h5', f'data/{year}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:41.910256Z",
     "start_time": "2020-06-20T16:19:41.900798Z"
    }
   },
   "outputs": [],
   "source": [
    "data_to_hdf()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-25T16:16:20.444276Z",
     "start_time": "2020-06-25T16:16:10.229101Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "MultiIndex: 9853200 entries, ('AAPL', Timestamp('2013-01-02 09:30:00')) to ('YHOO', Timestamp('2013-12-31 16:00:00'))\n",
      "Data columns (total 43 columns):\n",
      " #   Column        Dtype  \n",
      "---  ------        -----  \n",
      " 0   obprice       float64\n",
      " 1   obsize        float64\n",
      " 2   oaprice       float64\n",
      " 3   oasize        float64\n",
      " 4   first         float64\n",
      " 5   firstsize     float64\n",
      " 6   hbprice       float64\n",
      " 7   hbsize        float64\n",
      " 8   haprice       float64\n",
      " 9   hasize        float64\n",
      " 10  high          float64\n",
      " 11  highsize      float64\n",
      " 12  lbprice       float64\n",
      " 13  lbsize        float64\n",
      " 14  laprice       float64\n",
      " 15  lasize        float64\n",
      " 16  low           float64\n",
      " 17  lowsize       float64\n",
      " 18  cbprice       float64\n",
      " 19  cbsize        float64\n",
      " 20  caprice       float64\n",
      " 21  casize        float64\n",
      " 22  last          float64\n",
      " 23  lastsize      float64\n",
      " 24  minspread     float64\n",
      " 25  maxspread     float64\n",
      " 26  price         float64\n",
      " 27  nbbo          int64  \n",
      " 28  atbid         int64  \n",
      " 29  atbidmid      int64  \n",
      " 30  atmid         int64  \n",
      " 31  atmidask      int64  \n",
      " 32  atask         int64  \n",
      " 33  volume        int64  \n",
      " 34  ntrades       int64  \n",
      " 35  fvolume       int64  \n",
      " 36  fprice        float64\n",
      " 37  up            int64  \n",
      " 38  down          int64  \n",
      " 39  rup           int64  \n",
      " 40  rdown         int64  \n",
      " 41  volweight     float64\n",
      " 42  volweightrel  float64\n",
      "dtypes: float64(30), int64(13)\n",
      "memory usage: 3.2+ GB\n"
     ]
    }
   ],
   "source": [
    "with pd.HDFStore('algoseek.h5') as store:\n",
    "    df = store['data/2013']\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading Algoseek Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:41.918678Z",
     "start_time": "2020-06-20T16:19:41.911408Z"
    }
   },
   "outputs": [],
   "source": [
    "ohlcv_cols = ['first', 'high', 'low', 'last', 'price', 'volume']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:41.926726Z",
     "start_time": "2020-06-20T16:19:41.919591Z"
    }
   },
   "outputs": [],
   "source": [
    "data_cols = ohlcv_cols + ['up', 'down', 'rup', 'rdown', 'atask', 'atbid']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.043687Z",
     "start_time": "2020-06-20T16:19:41.927914Z"
    }
   },
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'No object named data/2013 in the file'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-13-9b1d9cd5e0cb>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mHDFStore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'algoseek.h5'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mstore\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m     df = (pd.concat([store[f'data/{year}']\n\u001b[0;32m----> 4\u001b[0;31m                      .loc[:, data_cols] for year in years])\n\u001b[0m\u001b[1;32m      5\u001b[0m           .sort_index())\n",
      "\u001b[0;32m<ipython-input-13-9b1d9cd5e0cb>\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mHDFStore\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'algoseek.h5'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mstore\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m     df = (pd.concat([store[f'data/{year}']\n\u001b[0;32m----> 4\u001b[0;31m                      .loc[:, data_cols] for year in years])\n\u001b[0m\u001b[1;32m      5\u001b[0m           .sort_index())\n",
      "\u001b[0;32m~/.pyenv/versions/miniconda3-latest/envs/ml4t/lib/python3.7/site-packages/pandas/io/pytables.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m    551\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    552\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 553\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    554\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    555\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__setitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.pyenv/versions/miniconda3-latest/envs/ml4t/lib/python3.7/site-packages/pandas/io/pytables.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m    744\u001b[0m         \u001b[0mgroup\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_node\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    745\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mgroup\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 746\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mKeyError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"No object named {key} in the file\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    747\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_read_group\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgroup\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    748\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mKeyError\u001b[0m: 'No object named data/2013 in the file'"
     ]
    }
   ],
   "source": [
    "years = range(2013, 2018)\n",
    "with pd.HDFStore('algoseek.h5') as store:\n",
    "    df = (pd.concat([store[f'data/{year}']\n",
    "                     .loc[:, data_cols] for year in years])\n",
    "          .sort_index())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.045151Z",
     "start_time": "2020-06-20T16:19:41.396Z"
    }
   },
   "outputs": [],
   "source": [
    "df.loc[:, ohlcv_cols[:4]] = df.loc[:, ohlcv_cols[:4]].groupby('ticker').fillna(method='ffill')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.045645Z",
     "start_time": "2020-06-20T16:19:41.398Z"
    }
   },
   "outputs": [],
   "source": [
    "df.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.046254Z",
     "start_time": "2020-06-20T16:19:41.400Z"
    }
   },
   "outputs": [],
   "source": [
    "df.to_hdf('hf_data.h5', 'data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.046797Z",
     "start_time": "2020-06-20T16:19:41.403Z"
    }
   },
   "outputs": [],
   "source": [
    "df = pd.read_hdf('hf_data.h5', 'data')\n",
    "# .loc[idx['AAPL', '2013'], :]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Engineering"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "All of the features above were normalized in a standard fashion\n",
    "by subtracting their means, dividing by their standard deviations, and time-averaging over a recent\n",
    "interval. In order to obtain a finite state space, features were discretized into bins in multiples of\n",
    "standard deviation units"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.047466Z",
     "start_time": "2020-06-20T16:19:41.406Z"
    }
   },
   "outputs": [],
   "source": [
    "df = df.sort_index()\n",
    "df['date'] = pd.to_datetime(df.index.get_level_values('date_time').date)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.047923Z",
     "start_time": "2020-06-20T16:19:41.408Z"
    }
   },
   "outputs": [],
   "source": [
    "df.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.048354Z",
     "start_time": "2020-06-20T16:19:41.410Z"
    }
   },
   "outputs": [],
   "source": [
    "by_ticker = df.groupby('ticker', group_keys=False)\n",
    "by_ticker_date = df.groupby(['ticker', 'date'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.048939Z",
     "start_time": "2020-06-20T16:19:41.413Z"
    }
   },
   "outputs": [],
   "source": [
    "data = pd.DataFrame(index=df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.049693Z",
     "start_time": "2020-06-20T16:19:41.415Z"
    }
   },
   "outputs": [],
   "source": [
    "data['date'] = pd.factorize(df['date'], sort=True)[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.050376Z",
     "start_time": "2020-06-20T16:19:41.417Z"
    }
   },
   "outputs": [],
   "source": [
    "data['minute'] = pd.to_timedelta(data.index.get_level_values('date_time').time.astype(str))\n",
    "data.minute = (data.minute.dt.seconds.sub(data.minute.dt.seconds.min()).div(60).astype(int))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lagged Returns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.051005Z",
     "start_time": "2020-06-20T16:19:41.420Z"
    }
   },
   "outputs": [],
   "source": [
    "for t in range(1, 11):\n",
    "    print(t, end=' ', flush=True)\n",
    "    data[f'ret{t}min'] = (df\n",
    "                          .sort_index()\n",
    "                          .groupby(['ticker', 'date'])\n",
    "                          .price\n",
    "                          .pct_change(periods=t, fill_method=None)\n",
    "                          .shift())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Forward Returns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.051725Z",
     "start_time": "2020-06-20T16:19:41.422Z"
    }
   },
   "outputs": [],
   "source": [
    "data['fwd1min'] = (data\n",
    "                   .sort_index()\n",
    "                   .groupby(['ticker', 'date'])\n",
    "                   .ret1min\n",
    "                   .shift(-1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.052433Z",
     "start_time": "2020-06-20T16:19:41.425Z"
    }
   },
   "outputs": [],
   "source": [
    "data.info(null_counts=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Normalized up/downtick volume"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.053232Z",
     "start_time": "2020-06-20T16:19:41.428Z"
    }
   },
   "outputs": [],
   "source": [
    "data['rup'] = df.rup.div(df.volume)\n",
    "data['up'] = df.up.div(df.volume)\n",
    "data['down'] = df.down.div(df.volume)\n",
    "data['rdown'] = df.rdown.div(df.volume)\n",
    "for f in ['up', 'down', 'rup', 'rdown']:\n",
    "    data[f] = data.groupby(['ticker', 'date'])[f].shift()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Balance of Power"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.053943Z",
     "start_time": "2020-06-20T16:19:41.431Z"
    }
   },
   "outputs": [],
   "source": [
    "data['BOP'] = (by_ticker\n",
    "               .apply(lambda x: talib.BOP(x['first'],\n",
    "                                          x.high,\n",
    "                                          x.low,\n",
    "                                          x['last'])\n",
    "                      .shift()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "###  Commodity Channel Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.054696Z",
     "start_time": "2020-06-20T16:19:41.434Z"
    }
   },
   "outputs": [],
   "source": [
    "data['CCI'] = (by_ticker\n",
    "               .apply(lambda x: talib.CCI(x.high,\n",
    "                                          x.low,\n",
    "                                          x['last'],\n",
    "                                          timeperiod=14).shift()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Money Flow Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.055393Z",
     "start_time": "2020-06-20T16:19:41.436Z"
    }
   },
   "outputs": [],
   "source": [
    "data['MFI'] = (by_ticker\n",
    "               .apply(lambda x: talib.MFI(x.high,\n",
    "                                          x.low,\n",
    "                                          x['last'],\n",
    "                                          x.volume,\n",
    "                                          timeperiod=14)\n",
    "                      .shift()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Stochastic RSI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.056443Z",
     "start_time": "2020-06-20T16:19:41.439Z"
    }
   },
   "outputs": [],
   "source": [
    "data['STOCHRSI'] = (by_ticker.apply(lambda x: talib.STOCHRSI(x['last'],\n",
    "                                                             timeperiod=14,\n",
    "                                                             fastk_period=14,\n",
    "                                                             fastd_period=3,\n",
    "                                                             fastd_matype=0)[0]\n",
    "                                    .shift()))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Stochastic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.057908Z",
     "start_time": "2020-06-20T16:19:41.442Z"
    }
   },
   "outputs": [],
   "source": [
    "def compute_stoch(x, fastk_period=14, slowk_period=3, \n",
    "                  slowk_matype=0, slowd_period=3, slowd_matype=0):\n",
    "    slowk, slowd = talib.STOCH(x.high, x.low, x['last'],\n",
    "                           fastk_period=fastk_period,\n",
    "                           slowk_period=slowk_period,\n",
    "                           slowk_matype=slowk_matype,\n",
    "                           slowd_period=slowd_period,\n",
    "                           slowd_matype=slowd_matype)\n",
    "    return (slowd/slowk-1).shift()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.059288Z",
     "start_time": "2020-06-20T16:19:41.444Z"
    }
   },
   "outputs": [],
   "source": [
    "data['STOCH'] = by_ticker.apply(compute_stoch).replace((np.inf, -np.inf), np.nan)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Transaction Volume by price point"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.060130Z",
     "start_time": "2020-06-20T16:19:41.447Z"
    }
   },
   "outputs": [],
   "source": [
    "data['trades_bid_ask'] = df.atask.sub(df.atbid).div(df.volume).replace((np.inf, -np.inf), np.nan)\n",
    "data['trades_bid_ask'] = data.groupby(['ticker', 'date']).trades_bid_ask.shift()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Evaluate features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.061000Z",
     "start_time": "2020-06-20T16:19:41.450Z"
    }
   },
   "outputs": [],
   "source": [
    "features = ['ret1min', 'ret2min', 'ret3min', 'ret4min', 'ret5min', \n",
    "            'ret6min', 'ret7min', 'ret8min', 'ret9min', 'ret10min',\n",
    "            'rup', 'up', 'down', 'rdown', \n",
    "            'BOP', 'CCI', 'MFI', 'STOCHRSI', 'STOCH', \n",
    "            'trades_bid_ask']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.061817Z",
     "start_time": "2020-06-20T16:19:41.452Z"
    }
   },
   "outputs": [],
   "source": [
    "ic = {}\n",
    "for feature in features:\n",
    "    print(feature)\n",
    "    df_ = data[['fwd1min', feature]].dropna()\n",
    "    ic[feature] = spearmanr(df_.fwd1min, df_[feature])\n",
    "ic = pd.Series(ic).apply(pd.Series)\n",
    "ic.columns = ['IC', 'p-value']    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.062630Z",
     "start_time": "2020-06-20T16:19:41.455Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "ic.sort_values('IC')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.063455Z",
     "start_time": "2020-06-20T16:19:41.459Z"
    }
   },
   "outputs": [],
   "source": [
    "ic.index = ic.index.map(str.upper)\n",
    "ax = ic['IC'].sort_values(ascending=False).mul(100).plot.bar(figsize=(14, 4), \n",
    "                                                        title='Information Coeficient for HF Features (1-min forward returns)',\n",
    "                                                       rot=0)\n",
    "ax.set_ylabel('Information Coefficient')\n",
    "plt.tight_layout()\n",
    "plt.savefig('figures/hft_ic', dpi=300);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.064294Z",
     "start_time": "2020-06-20T16:19:41.462Z"
    }
   },
   "outputs": [],
   "source": [
    "ic.sort_values('IC').to_csv('hf_ic.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Store results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.065152Z",
     "start_time": "2020-06-20T16:19:41.464Z"
    }
   },
   "outputs": [],
   "source": [
    "data.drop(['date', 'up', 'down'], axis=1).to_hdf('hf_data.h5', 'model_data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-06-20T16:19:42.066039Z",
     "start_time": "2020-06-20T16:19:41.467Z"
    }
   },
   "outputs": [],
   "source": [
    "data.info(null_counts=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": true,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
